function [y_out] = ACCENSE_embed(X,perplexity,filename)

%ACCENSE_EMBED Computes a 2-dimensional embedding of X using t-SNE
%
%Karthik Shekhar, MIT 2013 (Algorithmic parts relating to t-SNE have been
%adapted from the original implementation by Laurens van der Maaten)
%
%Requires
%   zscore (Statistics Toolbox)
%
%Inputs:
%   X - N x D matrix of datapoints. For mass cytometry, code assumes that
%       points have already been arcsinh transformed. At most 25000 rows.
%   perplexity - number, typically between 10 and 50 (default 30 when
%       omitted or empty)
%   filename - Name of .mat file which stores the outputs (default
%       'ACCENSE_out.mat' when omitted or empty)
%
%Outputs:
%   y_out - N x 2 matrix of embedded coordinates. Empty ([]) when the
%       function aborts because X is missing or has too many rows.
%
%Side effects:
%   Draws a scatter plot of the embedding on the current axes and saves the
%   figure as 'ACCENSE_embeddings' (epsc format). Saves y_out together with
%   the per-column means (mu1) and standard deviations (sigma) returned by
%   zscore to the .mat file named by filename.
%
%Functions
%   Hbeta - called to get the Gaussian kernel row and its entropy for a
%           given precision
%   tsne  - called to run the gradient descent that produces the embedding

% Default output, so that the early exits below do not leave y_out
% unassigned (which would raise an error for callers requesting an output)
y_out = [];

%arg-check

if (nargin < 1)
    fprintf('Error : Input data matrix is missing. Aborting ...\n');
    return;
end

if size(X,1) > 25000
    fprintf('Error : Too many datapoints! X should have fewer than 2.5e4 rows. Aborting ...\n');
    return;
end

if size(X,2) > 50
    fprintf('Warning : Large dimensionality of data. Consider performing PCA and reducing the dimensionality to 50 or so. Proceeding anyway ...\n');
end


if (nargin < 2) || isempty(perplexity)
    fprintf('Assuming a perplexity = 30 \n');
    perplexity = 30;
end

if (nargin < 3) || isempty(filename)
    fprintf('No filename provided. All data will be stored in ACCENSE_out.mat\n');
    filename = 'ACCENSE_out.mat';
end


%Scale X to zero mean and unit variance across each column. This removes
%differences in scale between features

[X_scaled, mu1, sigma] = zscore(X); %mu1 and sigma are row vectors describing the means and standard deviations of each feature


% Compute pairwise squared Euclidean distance matrix via
% |a - b|^2 = |a|^2 + |b|^2 - 2*a.b
sum_X = sum(X_scaled .^ 2, 2);
D = bsxfun(@plus, sum_X, bsxfun(@plus, sum_X', -2 * (X_scaled * X_scaled')));

disp('Computing Markovian probabilities in the original space...');
tol = 1e-5;                         % tolerance on |entropy - log(perplexity)|

% Initialize some variables
n = size(D, 1);                     % number of instances
P = zeros(n, n);                    % empty probability matrix
beta = ones(n, 1);                  % kernel precision per point (sigma is reported below as sqrt(1 / beta))
logPerp = log(perplexity);          % log of perplexity (= target entropy)

% Run over all datapoints
for i=1:n

    if mod(i, 500) == 0
        disp(['Computed Transition P-values ' num2str(i) ' of ' num2str(n) ' datapoints...']);
    end

    % Columns of every point except i itself (self-similarity is excluded)
    others = [1:i - 1, i + 1:n];

    % Set minimum and maximum values for precision
    betamin = -Inf;
    betamax = Inf;

    % Compute the Gaussian kernel and entropy for the current precision
    [H, thisP] = Hbeta(D(i, others), beta(i));

    % Evaluate whether the perplexity is within tolerance; otherwise search
    % for the precision by doubling/halving until both bounds are known and
    % bisecting afterwards (at most 80 adjustments)
    Hdiff = H - logPerp;
    tries = 0;
    while abs(Hdiff) > tol && tries < 80

        % If not, increase or decrease precision
        if Hdiff > 0
            % Entropy too high: the precision must grow
            betamin = beta(i);
            if isinf(betamax)
                beta(i) = beta(i) * 2;
            else
                beta(i) = (beta(i) + betamax) / 2;
            end
        else
            % Entropy too low: the precision must shrink
            betamax = beta(i);
            if isinf(betamin)
                beta(i) = beta(i) / 2;
            else
                beta(i) = (beta(i) + betamin) / 2;
            end
        end

        % Recompute the values
        [H, thisP] = Hbeta(D(i, others), beta(i));
        Hdiff = H - logPerp;
        tries = tries + 1;
    end

    % Set the final row of P (the diagonal entry stays zero)
    P(i, others) = thisP;
end

% Report the kernel widths found. Kept in its own variable because 'sigma'
% holds the zscore standard deviations that are saved below.
kernel_sigma = sqrt(1 ./ beta);
disp(['Mean value of sigma: ' num2str(mean(kernel_sigma))]);
disp(['Minimum value of sigma: ' num2str(min(kernel_sigma))]);
disp(['Maximum value of sigma: ' num2str(max(kernel_sigma))]);

clear D

disp('Running t-SNE')
% Run t-SNE
y_out = tsne(P, 2);

%Scatter plot of embeddings (original data)
scatter(y_out(:,1),y_out(:,2),35,'b.')
xlabel('$y_1$','Interpreter','Latex','FontSize',20);
ylabel('$y_2$','Interpreter','Latex','FontSize',20);
set(gca,'FontSize',12);
saveas(gcf,'ACCENSE_embeddings','epsc');


save(filename,'y_out','mu1','sigma');

end

% Function that computes the Gaussian kernel values given a vector of
% squared Euclidean distances, and the precision of the Gaussian kernel.
% The function also computes the entropy of the resulting distribution
% (the caller compares it against the log of the target perplexity).
%
% Inputs:
%   D    - vector of squared Euclidean distances
%   beta - scalar precision of the Gaussian kernel
% Outputs:
%   H - entropy (natural log) of the normalised kernel values
%   P - kernel values exp(-D * beta), normalised to sum to one
function [H, P] = Hbeta(D, beta)
    % Nothing to normalise for an empty distance vector; return the same
    % values the unshifted formula produces (H = NaN, P empty).
    if isempty(D)
        H = NaN;
        P = zeros(size(D));
        return;
    end

    % Shift the distances so that the smallest one is zero. The common
    % factor exp(-beta * min(D)) cancels in both P and H, so the result is
    % mathematically unchanged, but the largest kernel value is now exactly
    % 1 and sumP can no longer underflow to zero (which produced NaNs).
    Dshift = D - min(D(:));
    P = exp(-Dshift * beta);
    sumP = sum(P);
    H = log(sumP) + beta * sum(Dshift .* P) / sumP;
    P = P / sumP;
end


function y_out = tsne(P, no_dims)

%TSNE Performs symmetric t-SNE on affinity matrix P
%
%
% The function performs symmetric t-SNE on pairwise similarity matrix P
% to create a low-dimensional map of no_dims dimensions (default = 2).
% P must be square. Its diagonal is zeroed and it is symmetrized and
% normalised to sum to one inside this function, so row-wise conditional
% probabilities may be passed in directly.
%
% Inputs:
%   P       - n x n matrix of pairwise similarities
%   no_dims - dimensionality of the map (default 2 when omitted or empty)
% Output:
%   y_out   - n x no_dims matrix of map coordinates, centred on the origin
%
% The initial solution is drawn with randn, so results differ between
% calls. Progress is printed every 10 iterations.


% Original code by Laurens van der Maaten, 2010
% TU Delft

% Modified by Karthik Shekhar, 2013


    if ~exist('no_dims', 'var') || isempty(no_dims)
        no_dims = 2;
    end

    % Initialize some variables
    n = size(P, 1);                                     % number of instances
    momentum = 0.5;                                     % initial momentum
    final_momentum = 0.8;                               % value to which momentum is changed
    mom_switch_iter = 300;                              % iteration at which momentum is changed
    stop_lying_iter = 200;                              % iteration at which lying about P-values is stopped
    max_iter = 3500;                                    % maximum number of iterations
    epsilon = 100;                                      % initial learning rate
    min_gain = .01;                                     % minimum gain for delta-bar-delta
    costTol = 1e-5;                                     % Tolerance for relative improvement in cost function

    % Make sure P-vals are set properly
    P(1:n + 1:end) = 0;                                 % set diagonal to zero
    P = 0.5 * (P + P');                                 % symmetrize P-values
    P = max(P ./ sum(P(:)), realmin);                   % make sure P-values sum to one
    const = sum(P(:) .* log(P(:)));                     % constant in KL divergence

    P = P * 4;                                          % "Early exaggeration" to find better local-minima


    % Initialize the solution
    y_out = .0001 * randn(n, no_dims);

    y_incs  = zeros(size(y_out));
    gains = ones(size(y_out));


    flagRel = 0;                                        % becomes 1 once a reference cost (cost0) has been recorded

    % Run the iterations
    for iter=1:max_iter

        % Compute joint probability that point i and j are neighbors
        sum_ydata = sum(y_out .^ 2, 2);
        num = 1 ./ (1 + bsxfun(@plus, sum_ydata, bsxfun(@plus, sum_ydata', -2 * (y_out * y_out')))); % Student-t distribution
        num(1:n+1:end) = 0;                                                 % set diagonal to zero
        Q = max(num ./ sum(num(:)), realmin);                               % normalize to get probabilities

        % Compute the gradients. Equivalent to
        % 4 * (diag(sum(L, 1)) - L) * y_out, but scales the rows of y_out
        % directly instead of materialising an n x n diagonal matrix.
        L = (P - Q) .* num;
        y_grads = 4 * (bsxfun(@times, sum(L, 1)', y_out) - L * y_out);

        % Update the solution
        gains = (gains + .2) .* (sign(y_grads) ~= sign(y_incs)) ...         % note that the y_grads are actually -y_grads
              + (gains * .8) .* (sign(y_grads) == sign(y_incs));
        gains(gains < min_gain) = min_gain;
        y_incs = momentum * y_incs - epsilon * (gains .* y_grads);
        y_out = y_out + y_incs;
        y_out = bsxfun(@minus, y_out, mean(y_out, 1));                      % keep the map centred on the origin

        % Update the momentum if necessary
        if iter == mom_switch_iter
            momentum = final_momentum;
        end
        if iter == stop_lying_iter
            P = P ./ 4;                                                     % undo the early exaggeration
        end

        % Print out progress
        if ~rem(iter, 10)
            % Q was computed before this iteration's update of y_out, so the
            % reported cost lags the current solution by one step.
            cost = const - sum(P(:) .* log(Q(:)));
            disp(['Iteration ' num2str(iter) ': error is ' num2str(cost)]);


            if flagRel == 1

                disp(['Iteration ' num2str(iter) ': Relative improvement in cost is ' num2str((cost0 - cost)/cost0)]);

                % Stop once the cost improved by less than costTol
                % (relative) since the previous report; this also triggers
                % when the cost went up.
                if (cost0-cost)/cost0 < costTol
                    disp(['Exiting as relative improvement in cost < 1e-5']);
                    break
                end

                cost0 = cost;
            end

            % Record the first reference cost only after the exaggeration
            % of P has been removed.
            if flagRel == 0 && iter > stop_lying_iter
                cost0 = cost;
                flagRel = 1;

            end


        end


    end

end